import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
# Silence every warning category for the rest of the session to keep cell output clean.
# NOTE(review): this also hides library deprecation/future warnings — confirm that is intended.
warnings.filterwarnings("ignore")
# Track-level table: one row per song.
data = pd.read_csv("../Spotify Music Dataset/data/data.csv")

# Companion tables stored as separate CSV files (by genre, by year, by artist),
# read in that order.
genre_data, year_data, artist_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../Spotify Music Dataset/data_by_genres.csv',
        '../Spotify Music Dataset/data_by_year.csv',
        '../Spotify Music Dataset/data_by_artist/data_by_artist.csv',
    )
)

# Preview the first five tracks.
data.head(5)
| valence | year | acousticness | artists | danceability | duration_ms | energy | explicit | id | instrumentalness | key | liveness | loudness | mode | name | popularity | release_date | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.982 | ['Sergei Rachmaninoff', 'James Levine', 'Berli... | 0.279 | 831667 | 0.211 | 0 | 4BJqT0PrAfrxzMOxytFOIz | 0.878000 | 10 | 0.665 | -20.096 | 1 | Piano Concerto No. 3 in D Minor, Op. 30: III. ... | 4 | 1921 | 0.0366 | 80.954 |
| 1 | 0.9630 | 1921 | 0.732 | ['Dennis Day'] | 0.819 | 180533 | 0.341 | 0 | 7xPhfUan2yNtyFG0cUWkt8 | 0.000000 | 7 | 0.160 | -12.441 | 1 | Clancy Lowered the Boom | 5 | 1921 | 0.4150 | 60.936 |
| 2 | 0.0394 | 1921 | 0.961 | ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... | 0.328 | 500062 | 0.166 | 0 | 1o6I8BglA6ylDMrIELygv1 | 0.913000 | 3 | 0.101 | -14.850 | 1 | Gati Bali | 5 | 1921 | 0.0339 | 110.339 |
| 3 | 0.1650 | 1921 | 0.967 | ['Frank Parker'] | 0.275 | 210000 | 0.309 | 0 | 3ftBPsC5vPBKxYSee08FDH | 0.000028 | 5 | 0.381 | -9.316 | 1 | Danny Boy | 3 | 1921 | 0.0354 | 100.109 |
| 4 | 0.2530 | 1921 | 0.957 | ['Phil Regan'] | 0.418 | 166693 | 0.193 | 0 | 4d6HGyGT8e121BsdKmw9v6 | 0.000002 | 3 | 0.229 | -10.096 | 1 | When Irish Eyes Are Smiling | 2 | 1921 | 0.0380 | 101.665 |
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric track columns.
data.describe()
| valence | year | acousticness | danceability | duration_ms | energy | explicit | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 1.706530e+05 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 | 170653.000000 |
| mean | 0.528587 | 1976.787241 | 0.502115 | 0.537396 | 2.309483e+05 | 0.482389 | 0.084575 | 0.167010 | 5.199844 | 0.205839 | -11.467990 | 0.706902 | 31.431794 | 0.098393 | 116.861590 |
| std | 0.263171 | 25.917853 | 0.376032 | 0.176138 | 1.261184e+05 | 0.267646 | 0.278249 | 0.313475 | 3.515094 | 0.174805 | 5.697943 | 0.455184 | 21.826615 | 0.162740 | 30.708533 |
| min | 0.000000 | 1921.000000 | 0.000000 | 0.000000 | 5.108000e+03 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | -60.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 0.317000 | 1956.000000 | 0.102000 | 0.415000 | 1.698270e+05 | 0.255000 | 0.000000 | 0.000000 | 2.000000 | 0.098800 | -14.615000 | 0.000000 | 11.000000 | 0.034900 | 93.421000 |
| 50% | 0.540000 | 1977.000000 | 0.516000 | 0.548000 | 2.074670e+05 | 0.471000 | 0.000000 | 0.000216 | 5.000000 | 0.136000 | -10.580000 | 1.000000 | 33.000000 | 0.045000 | 114.729000 |
| 75% | 0.747000 | 1999.000000 | 0.893000 | 0.668000 | 2.624000e+05 | 0.703000 | 0.000000 | 0.102000 | 8.000000 | 0.261000 | -7.183000 | 1.000000 | 48.000000 | 0.075600 | 135.537000 |
| max | 1.000000 | 2020.000000 | 0.996000 | 0.988000 | 5.403500e+06 | 1.000000 | 1.000000 | 1.000000 | 11.000000 | 1.000000 | 3.855000 | 1.000000 | 100.000000 | 0.970000 | 243.507000 |
# Column dtypes, non-null counts and memory footprint of the track-level table.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 170653 entries, 0 to 170652 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 valence 170653 non-null float64 1 year 170653 non-null int64 2 acousticness 170653 non-null float64 3 artists 170653 non-null object 4 danceability 170653 non-null float64 5 duration_ms 170653 non-null int64 6 energy 170653 non-null float64 7 explicit 170653 non-null int64 8 id 170653 non-null object 9 instrumentalness 170653 non-null float64 10 key 170653 non-null int64 11 liveness 170653 non-null float64 12 loudness 170653 non-null float64 13 mode 170653 non-null int64 14 name 170653 non-null object 15 popularity 170653 non-null int64 16 release_date 170653 non-null object 17 speechiness 170653 non-null float64 18 tempo 170653 non-null float64 dtypes: float64(9), int64(6), object(4) memory usage: 24.7+ MB
# Column dtypes, non-null counts and memory footprint of the per-genre table.
genre_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2973 entries, 0 to 2972 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mode 2973 non-null int64 1 genres 2973 non-null object 2 acousticness 2973 non-null float64 3 danceability 2973 non-null float64 4 duration_ms 2973 non-null float64 5 energy 2973 non-null float64 6 instrumentalness 2973 non-null float64 7 liveness 2973 non-null float64 8 loudness 2973 non-null float64 9 speechiness 2973 non-null float64 10 tempo 2973 non-null float64 11 valence 2973 non-null float64 12 popularity 2973 non-null float64 13 key 2973 non-null int64 dtypes: float64(11), int64(2), object(1) memory usage: 325.3+ KB
# Column dtypes, non-null counts and memory footprint of the per-year table.
year_data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 100 entries, 0 to 99 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 mode 100 non-null int64 1 year 100 non-null int64 2 acousticness 100 non-null float64 3 danceability 100 non-null float64 4 duration_ms 100 non-null float64 5 energy 100 non-null float64 6 instrumentalness 100 non-null float64 7 liveness 100 non-null float64 8 loudness 100 non-null float64 9 speechiness 100 non-null float64 10 tempo 100 non-null float64 11 valence 100 non-null float64 12 popularity 100 non-null float64 13 key 100 non-null int64 dtypes: float64(11), int64(3) memory usage: 11.1 KB
# Side-by-side box plots of selected track attributes, drawn on one wide figure
# so their spread and outliers can be compared at a glance.
boxplot_columns = [
    'valence',
    'acousticness',
    'danceability',
    'energy',
    'explicit',
    'instrumentalness',
    'liveness',
    'speechiness',
]
plt.figure(figsize=(14, 6))
boxplot = data.boxplot(column=boxplot_columns)
plt.show()
# Compute the pairwise correlation matrix of the numeric columns only.
# `data` also carries text columns (artists, id, name, release_date); recent
# pandas versions raise on those instead of silently dropping them, so the
# numeric columns are selected explicitly before calling corr().
corr = data.select_dtypes(include=np.number).corr()

# Hide the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle already shows every pair once.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure.
f, ax = plt.subplots(figsize=(7, 9))

# Diverging colormap so positive and negative correlations get distinct hues.
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and square cells; the colour scale is centred
# on 0 and capped at 0.3, so stronger positive correlations share the top colour.
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
!pip install yellowbrick
Requirement already satisfied: yellowbrick in c:\users\prakh\anaconda3\lib\site-packages (1.5) Requirement already satisfied: numpy>=1.16.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.23.5) Requirement already satisfied: cycler>=0.10.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (0.11.0) Requirement already satisfied: scikit-learn>=1.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.2.1) Requirement already satisfied: scipy>=1.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (1.10.0) Requirement already satisfied: matplotlib!=3.0.0,>=2.0.2 in c:\users\prakh\anaconda3\lib\site-packages (from yellowbrick) (3.7.0) Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.4.4) Requirement already satisfied: pyparsing>=2.3.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (3.0.9) Requirement already satisfied: fonttools>=4.22.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (4.25.0) Requirement already satisfied: contourpy>=1.0.1 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.0.5) Requirement already satisfied: python-dateutil>=2.7 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (2.8.2) Requirement already satisfied: pillow>=6.2.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (9.4.0) Requirement already satisfied: packaging>=20.0 in c:\users\prakh\anaconda3\lib\site-packages (from matplotlib!=3.0.0,>=2.0.2->yellowbrick) (22.0) Requirement already satisfied: threadpoolctl>=2.0.0 in c:\users\prakh\anaconda3\lib\site-packages (from scikit-learn>=1.0.0->yellowbrick) (2.2.0) Requirement already satisfied: joblib>=1.1.1 in c:\users\prakh\anaconda3\lib\site-packages (from 
scikit-learn>=1.0.0->yellowbrick) (1.1.1) Requirement already satisfied: six>=1.5 in c:\users\prakh\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib!=3.0.0,>=2.0.2->yellowbrick) (1.16.0)
from yellowbrick.target import FeatureCorrelation

feature_names = ['popularity', 'acousticness', 'danceability', 'energy', 'instrumentalness', 'liveness',
                 'loudness', 'speechiness', 'tempo', 'valence', 'duration_ms', 'explicit', 'key', 'mode', 'year']

# The target must not be one of its own predictors: correlating popularity with
# itself only adds a trivial r = 1 bar to the chart.
target_name = 'popularity'
predictor_names = [name for name in feature_names if name != target_name]
X, Y = data[predictor_names], data[target_name]

# Axis labels for the chart, one per predictor column (same order as X).
features = np.array(predictor_names)

# Instantiate the visualizer
visualizer = FeatureCorrelation(labels=features)

plt.rcParams['figure.figsize'] = (10, 5)
visualizer.fit(X, Y)  # Fit the data to the visualizer
visualizer.show()
<Axes: title={'center': 'Features correlation with dependent variable'}, xlabel='Pearson Correlation'>
def year_to_decade(year):
    """Return the decade label for *year*, e.g. ``1921`` -> ``'1920s'``.

    Args:
        year: Calendar year as an int, an integral float, or a NumPy scalar.

    Returns:
        The first year of the decade followed by ``'s'``.
    """
    # Integer floor division keeps the arithmetic exact; the original
    # int(year / 10) took a detour through floating point.
    period_start = int(year) // 10 * 10
    return f'{period_start}s'
# Label every track with its decade, then count tracks per decade.
data['decade'] = data['year'].map(year_to_decade)
sns.set(rc={'figure.figsize': (11, 6)})
sns.countplot(x='decade', data=data)
<Axes: xlabel='decade', ylabel='count'>
# Line chart of several track attributes per year, read from the per-year table.
music_features = [
    'acousticness',
    'danceability',
    'energy',
    'instrumentalness',
    'liveness',
    'valence',
]
fig = px.line(
    year_data,
    x='year',
    y=music_features,
    width=900,
    height=500,
    title='Trend of various music features over decades',
)
fig.show()

# Loudness gets its own chart.
fig = px.line(
    year_data,
    x='year',
    y='loudness',
    title='Trend of loudness in the music over decades',
)
fig.show()
# Pick the ten genres with the highest popularity and stack four of their
# attributes per genre in a single bar chart.
top10_genres = genre_data.nlargest(n=10, columns='popularity')
stacked_features = ['valence', 'energy', 'danceability', 'acousticness']
fig = px.bar(
    top10_genres,
    x='genres',
    y=stacked_features,
    barmode='stack',
    width=900,
    height=650,
    title='Trend of various music features over top 10 genres',
)
fig.show()
Conclusions:
a. Acousticness and Energy have a high correlation compared to other music features.
b. The majority of the songs fall within the time range of 1950s to 2010s.
c. The level of energy in songs has progressively risen over time, while the instrumentalness has declined.
d. The level of acousticness in songs has significantly decreased over the years, particularly since the 1960s.
e. The trend of increasing loudness in songs is unmistakable and has reached its zenith in 2020.
f. Among the top 10 genres, energy and danceability are the most prominent characteristics.
# Ten artists with the highest popularity value, shown from most to least popular.
top10_popular_artists = artist_data.nlargest(n=10, columns='popularity')
print('Top 10 Artists that have the most popularity:')
top10_popular_artists.loc[:, ['popularity', 'artists']].sort_values(by='popularity', ascending=False)
Top 10 Artists that have the most popularity:
| popularity | artists | |
|---|---|---|
| 20966 | 93.0 | Ritt Momney |
| 14354 | 92.0 | Lele Pons |
| 15070 | 90.0 | Los Legendarios |
| 11764 | 89.0 | Jerry Di |
| 7463 | 88.0 | Emilee |
| 23687 | 88.0 | Surf Mesa |
| 28263 | 88.0 | salem ilese |
| 213 | 87.0 | A7S |
| 2343 | 86.0 | Beltito |
| 14378 | 86.0 | Lenny Santos |
# Ten artists with the largest 'count' value, shown in descending order.
top10_most_songs_artists = artist_data.nlargest(n=10, columns='count')
print('Top 10 Artists that produced most songs:')
top10_most_songs_artists.loc[:, ['count', 'artists']].sort_values(by='count', ascending=False)
Top 10 Artists that produced most songs:
| count | artists | |
|---|---|---|
| 8367 | 3169 | Francisco Canaro |
| 28561 | 2422 | Эрнест Хемингуэй |
| 28560 | 2136 | Эрих Мария Ремарк |
| 8434 | 1459 | Frank Sinatra |
| 10714 | 1256 | Ignacio Corsini |
| 27109 | 1200 | Vladimir Horowitz |
| 1682 | 1146 | Arturo Toscanini |
| 2707 | 1103 | Billie Holiday |
| 12378 | 1061 | Johnny Cash |
| 7426 | 1023 | Elvis Presley |
Based on the findings of the exploratory data analysis (EDA), the following conclusions can be drawn:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import numpy as np
# Group the genres into 10 clusters on their standardised numeric columns.
# random_state pins the centroid initialisation so cluster ids stay the same
# between runs; n_init is spelled out because its default differs across
# scikit-learn releases.
kmeans = KMeans(n_clusters=10, n_init=10, random_state=42)
cluster_pipeline = Pipeline([('scaler', StandardScaler()), ('kmeans', kmeans)])

# Only numeric columns can be scaled and clustered; this drops the 'genres' text column.
X = genre_data.select_dtypes(np.number)
cluster_pipeline.fit(X)
Pipeline(steps=[('scaler', StandardScaler()),
('kmeans', KMeans(n_clusters=10))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('scaler', StandardScaler()),
('kmeans', KMeans(n_clusters=10))])StandardScaler()
KMeans(n_clusters=10)
# Store each genre's cluster id (from the fitted scaler + KMeans pipeline) as a new column.
genre_data['cluster'] = cluster_pipeline.predict(X)
# Display the genre table with the added 'cluster' column.
genre_data
| mode | genres | acousticness | danceability | duration_ms | energy | instrumentalness | liveness | loudness | speechiness | tempo | valence | popularity | key | cluster | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 21st century classical | 0.979333 | 0.162883 | 1.602977e+05 | 0.071317 | 0.606834 | 0.361600 | -31.514333 | 0.040567 | 75.336500 | 0.103783 | 27.833333 | 6 | 9 |
| 1 | 1 | 432hz | 0.494780 | 0.299333 | 1.048887e+06 | 0.450678 | 0.477762 | 0.131000 | -16.854000 | 0.076817 | 120.285667 | 0.221750 | 52.500000 | 5 | 4 |
| 2 | 1 | 8-bit | 0.762000 | 0.712000 | 1.151770e+05 | 0.818000 | 0.876000 | 0.126000 | -9.180000 | 0.047000 | 133.444000 | 0.975000 | 48.000000 | 7 | 3 |
| 3 | 1 | [] | 0.651417 | 0.529093 | 2.328809e+05 | 0.419146 | 0.205309 | 0.218696 | -12.288965 | 0.107872 | 112.857352 | 0.513604 | 20.859882 | 7 | 1 |
| 4 | 1 | a cappella | 0.676557 | 0.538961 | 1.906285e+05 | 0.316434 | 0.003003 | 0.172254 | -12.479387 | 0.082851 | 112.110362 | 0.448249 | 45.820071 | 7 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2968 | 1 | zolo | 0.222625 | 0.547082 | 2.580991e+05 | 0.610240 | 0.143872 | 0.204206 | -11.295878 | 0.061088 | 125.494919 | 0.596155 | 33.778943 | 9 | 3 |
| 2969 | 0 | zouglou | 0.161000 | 0.863000 | 2.063200e+05 | 0.909000 | 0.000000 | 0.108000 | -5.985000 | 0.081300 | 119.038000 | 0.845000 | 58.000000 | 7 | 2 |
| 2970 | 1 | zouk | 0.263261 | 0.748889 | 3.060728e+05 | 0.622444 | 0.257227 | 0.089678 | -10.289222 | 0.038778 | 101.965222 | 0.824111 | 46.666667 | 5 | 3 |
| 2971 | 0 | zurich indie | 0.993000 | 0.705667 | 1.984173e+05 | 0.172667 | 0.468633 | 0.179667 | -11.453333 | 0.348667 | 91.278000 | 0.739000 | 0.000000 | 7 | 1 |
| 2972 | 1 | zydeco | 0.421038 | 0.629409 | 1.716717e+05 | 0.609369 | 0.019248 | 0.255877 | -9.854825 | 0.050491 | 126.366087 | 0.808544 | 30.261905 | 7 | 3 |
2973 rows × 15 columns
# Visualizing the Clusters with t-SNE
from sklearn.manifold import TSNE

# Standardise the genre features, then embed them in two dimensions.
# random_state fixes the embedding so the scatter plot is the same on every run.
tsne_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('tsne', TSNE(n_components=2, verbose=1, random_state=42)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# Attach the genre names and cluster ids to the 2-D coordinates for plotting.
projection = pd.DataFrame(columns=['x', 'y'], data=genre_embedding)
projection['genres'] = genre_data['genres']
projection['cluster'] = genre_data['cluster']

fig = px.scatter(
    projection, x='x', y='y', color='cluster', hover_data=['x', 'y', 'genres'])
fig.show()
[t-SNE] Computing 91 nearest neighbors... [t-SNE] Indexed 2973 samples in 0.013s... [t-SNE] Computed neighbors for 2973 samples in 0.683s... [t-SNE] Computed conditional probabilities for sample 1000 / 2973 [t-SNE] Computed conditional probabilities for sample 2000 / 2973 [t-SNE] Computed conditional probabilities for sample 2973 / 2973 [t-SNE] Mean sigma: 0.777516 [t-SNE] KL divergence after 250 iterations with early exaggeration: 76.106277 [t-SNE] KL divergence after 1000 iterations: 1.391782
# Group the songs into 20 clusters on their standardised numeric columns.
# random_state makes the cluster labels reproducible between runs; n_init is
# spelled out because its default differs across scikit-learn releases.
song_cluster_pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('kmeans', KMeans(n_clusters=20, n_init=10, random_state=42, verbose=False)),
    ],
    verbose=False,
)

# Numeric columns only; text columns such as artists, name and decade are excluded.
X = data.select_dtypes(np.number)
number_cols = list(X.columns)
song_cluster_pipeline.fit(X)
Pipeline(steps=[('scaler', StandardScaler()),
('kmeans', KMeans(n_clusters=20, verbose=False))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. Pipeline(steps=[('scaler', StandardScaler()),
('kmeans', KMeans(n_clusters=20, verbose=False))])StandardScaler()
KMeans(n_clusters=20, verbose=False)
# Display the numeric feature matrix the song clustering pipeline was fitted on.
X
| valence | year | acousticness | danceability | duration_ms | energy | explicit | instrumentalness | key | liveness | loudness | mode | popularity | speechiness | tempo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.98200 | 0.279 | 831667 | 0.211 | 0 | 0.878000 | 10 | 0.6650 | -20.096 | 1 | 4 | 0.0366 | 80.954 |
| 1 | 0.9630 | 1921 | 0.73200 | 0.819 | 180533 | 0.341 | 0 | 0.000000 | 7 | 0.1600 | -12.441 | 1 | 5 | 0.4150 | 60.936 |
| 2 | 0.0394 | 1921 | 0.96100 | 0.328 | 500062 | 0.166 | 0 | 0.913000 | 3 | 0.1010 | -14.850 | 1 | 5 | 0.0339 | 110.339 |
| 3 | 0.1650 | 1921 | 0.96700 | 0.275 | 210000 | 0.309 | 0 | 0.000028 | 5 | 0.3810 | -9.316 | 1 | 3 | 0.0354 | 100.109 |
| 4 | 0.2530 | 1921 | 0.95700 | 0.418 | 166693 | 0.193 | 0 | 0.000002 | 3 | 0.2290 | -10.096 | 1 | 2 | 0.0380 | 101.665 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 170648 | 0.6080 | 2020 | 0.08460 | 0.786 | 301714 | 0.808 | 0 | 0.000289 | 7 | 0.0822 | -3.702 | 1 | 72 | 0.0881 | 105.029 |
| 170649 | 0.7340 | 2020 | 0.20600 | 0.717 | 150654 | 0.753 | 0 | 0.000000 | 7 | 0.1010 | -6.020 | 1 | 68 | 0.0605 | 137.936 |
| 170650 | 0.6370 | 2020 | 0.10100 | 0.634 | 211280 | 0.858 | 0 | 0.000009 | 4 | 0.2580 | -2.226 | 0 | 76 | 0.0809 | 91.688 |
| 170651 | 0.1950 | 2020 | 0.00998 | 0.671 | 337147 | 0.623 | 1 | 0.000008 | 2 | 0.6430 | -7.161 | 1 | 70 | 0.3080 | 75.055 |
| 170652 | 0.6420 | 2020 | 0.13200 | 0.856 | 189507 | 0.721 | 1 | 0.004710 | 7 | 0.1820 | -4.928 | 1 | 74 | 0.1080 | 94.991 |
170653 rows × 15 columns
# Display the full track table, which now includes the derived 'decade' column.
data
| valence | year | acousticness | artists | danceability | duration_ms | energy | explicit | id | instrumentalness | key | liveness | loudness | mode | name | popularity | release_date | speechiness | tempo | decade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.0594 | 1921 | 0.98200 | ['Sergei Rachmaninoff', 'James Levine', 'Berli... | 0.279 | 831667 | 0.211 | 0 | 4BJqT0PrAfrxzMOxytFOIz | 0.878000 | 10 | 0.6650 | -20.096 | 1 | Piano Concerto No. 3 in D Minor, Op. 30: III. ... | 4 | 1921 | 0.0366 | 80.954 | 1920s |
| 1 | 0.9630 | 1921 | 0.73200 | ['Dennis Day'] | 0.819 | 180533 | 0.341 | 0 | 7xPhfUan2yNtyFG0cUWkt8 | 0.000000 | 7 | 0.1600 | -12.441 | 1 | Clancy Lowered the Boom | 5 | 1921 | 0.4150 | 60.936 | 1920s |
| 2 | 0.0394 | 1921 | 0.96100 | ['KHP Kridhamardawa Karaton Ngayogyakarta Hadi... | 0.328 | 500062 | 0.166 | 0 | 1o6I8BglA6ylDMrIELygv1 | 0.913000 | 3 | 0.1010 | -14.850 | 1 | Gati Bali | 5 | 1921 | 0.0339 | 110.339 | 1920s |
| 3 | 0.1650 | 1921 | 0.96700 | ['Frank Parker'] | 0.275 | 210000 | 0.309 | 0 | 3ftBPsC5vPBKxYSee08FDH | 0.000028 | 5 | 0.3810 | -9.316 | 1 | Danny Boy | 3 | 1921 | 0.0354 | 100.109 | 1920s |
| 4 | 0.2530 | 1921 | 0.95700 | ['Phil Regan'] | 0.418 | 166693 | 0.193 | 0 | 4d6HGyGT8e121BsdKmw9v6 | 0.000002 | 3 | 0.2290 | -10.096 | 1 | When Irish Eyes Are Smiling | 2 | 1921 | 0.0380 | 101.665 | 1920s |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 170648 | 0.6080 | 2020 | 0.08460 | ['Anuel AA', 'Daddy Yankee', 'KAROL G', 'Ozuna... | 0.786 | 301714 | 0.808 | 0 | 0KkIkfsLEJbrcIhYsCL7L5 | 0.000289 | 7 | 0.0822 | -3.702 | 1 | China | 72 | 2020-05-29 | 0.0881 | 105.029 | 2020s |
| 170649 | 0.7340 | 2020 | 0.20600 | ['Ashnikko'] | 0.717 | 150654 | 0.753 | 0 | 0OStKKAuXlxA0fMH54Qs6E | 0.000000 | 7 | 0.1010 | -6.020 | 1 | Halloweenie III: Seven Days | 68 | 2020-10-23 | 0.0605 | 137.936 | 2020s |
| 170650 | 0.6370 | 2020 | 0.10100 | ['MAMAMOO'] | 0.634 | 211280 | 0.858 | 0 | 4BZXVFYCb76Q0Klojq4piV | 0.000009 | 4 | 0.2580 | -2.226 | 0 | AYA | 76 | 2020-11-03 | 0.0809 | 91.688 | 2020s |
| 170651 | 0.1950 | 2020 | 0.00998 | ['Eminem'] | 0.671 | 337147 | 0.623 | 1 | 5SiZJoLXp3WOl3J4C8IK0d | 0.000008 | 2 | 0.6430 | -7.161 | 1 | Darkness | 70 | 2020-01-17 | 0.3080 | 75.055 | 2020s |
| 170652 | 0.6420 | 2020 | 0.13200 | ['KEVVO', 'J Balvin'] | 0.856 | 189507 | 0.721 | 1 | 7HmnJHfs0BkFzX4x8j0hkl | 0.004710 | 7 | 0.1820 | -4.928 | 1 | Billetes Azules (with J Balvin) | 74 | 2020-10-16 | 0.1080 | 94.991 | 2020s |
170653 rows × 20 columns
# Predict a cluster id for every song with the fitted scaler + KMeans pipeline.
song_cluster_labels = song_cluster_pipeline.predict(X)
# Store the ids on the track table so later cells can read them from there.
data['cluster_label'] = song_cluster_labels
# Visualizing the Clusters with PCA
from sklearn.decomposition import PCA

# Standardise the song features and keep the first two principal components.
pca_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('PCA', PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Combine the 2-D coordinates with each song's title and cluster id.
projection = pd.DataFrame(song_embedding, columns=['x', 'y'])
projection['title'] = data['name']
projection['cluster'] = data['cluster_label']

fig = px.scatter(
    projection,
    x='x',
    y='y',
    color='cluster',
    hover_data=['x', 'y', 'title'],
)
fig.show()